{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "imdb.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/lmoroney/tfbook/blob/master/chapter5/imdb.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zX4Kg8DUTKWO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "# https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Vzl0TH4svDDT",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!pip install beautifulsoup4"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "D1J15Vh_1Jih",
        "colab_type": "code",
        "cellView": "both",
        "colab": {}
      },
      "source": [
        "try:\n",
        "  # %tensorflow_version only exists in Colab.\n",
        "  %tensorflow_version 2.x\n",
        "except Exception:\n",
        "  pass\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BOjujz601HcS",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import tensorflow as tf\n",
        "from tensorflow import keras\n",
        "import tensorflow_datasets as tfds\n",
        "import numpy as np\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "otYi39HdKh2F",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "imdb_sentences = []\n",
        "train_data = tfds.as_numpy(tfds.load('imdb_reviews', split=\"train\"))\n",
        "for item in train_data:\n",
        "    imdb_sentences.append(str(item['text']))\n",
        "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)\n",
        "tokenizer.fit_on_texts(imdb_sentences)\n",
        "sequences = tokenizer.texts_to_sequences(imdb_sentences)\n",
        "print(tokenizer.word_index)\n",
        "print(sequences[123])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KjVUJK2Dhh2l",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from bs4 import BeautifulSoup\n",
        "import string\n",
        "\n",
        "stopwords = [\"a\", \"about\", \"above\", \"after\", \"again\", \"against\", \"all\", \"am\", \"an\", \"and\", \"any\", \"are\", \"as\", \"at\",\n",
        "             \"be\", \"because\", \"been\", \"before\", \"being\", \"below\", \"between\", \"both\", \"but\", \"by\", \"could\", \"did\", \"do\",\n",
        "             \"does\", \"doing\", \"down\", \"during\", \"each\", \"few\", \"for\", \"from\", \"further\", \"had\", \"has\", \"have\", \"having\",\n",
        "             \"he\", \"hed\", \"hes\", \"her\", \"here\", \"heres\", \"hers\", \"herself\", \"him\", \"himself\", \"his\", \"how\",\n",
        "             \"hows\", \"i\", \"id\", \"ill\", \"im\", \"ive\", \"if\", \"in\", \"into\", \"is\", \"it\", \"its\", \"itself\",\n",
        "             \"lets\", \"me\", \"more\", \"most\", \"my\", \"myself\", \"nor\", \"of\", \"on\", \"once\", \"only\", \"or\", \"other\", \"ought\",\n",
        "             \"our\", \"ours\", \"ourselves\", \"out\", \"over\", \"own\", \"same\", \"she\", \"shed\", \"shell\", \"shes\", \"should\",\n",
        "             \"so\", \"some\", \"such\", \"than\", \"that\", \"thats\", \"the\", \"their\", \"theirs\", \"them\", \"themselves\", \"then\",\n",
        "             \"there\", \"theres\", \"these\", \"they\", \"theyd\", \"theyll\", \"theyre\", \"theyve\", \"this\", \"those\", \"through\",\n",
        "             \"to\", \"too\", \"under\", \"until\", \"up\", \"very\", \"was\", \"we\", \"wed\", \"well\", \"were\", \"weve\", \"were\",\n",
        "             \"what\", \"whats\", \"when\", \"whens\", \"where\", \"wheres\", \"which\", \"while\", \"who\", \"whos\", \"whom\", \"why\",\n",
        "             \"whys\", \"with\", \"would\", \"you\", \"youd\", \"youll\", \"youre\", \"youve\", \"your\", \"yours\", \"yourself\",\n",
        "             \"yourselves\"]\n",
        "\n",
        "table = str.maketrans('', '', string.punctuation)\n",
        "\n",
        "imdb_sentences = []\n",
        "train_data = tfds.as_numpy(tfds.load('imdb_reviews', split=\"train\"))\n",
        "for item in train_data:\n",
        "    sentence = str(item['text'].decode('UTF-8').lower())\n",
        "    sentence = sentence.replace(\",\", \" , \")\n",
        "    sentence = sentence.replace(\".\", \" . \")\n",
        "    sentence = sentence.replace(\"-\", \" - \")\n",
        "    sentence = sentence.replace(\"/\", \" / \")\n",
        "    soup = BeautifulSoup(sentence)\n",
        "    sentence = soup.get_text()\n",
        "    words = sentence.split()\n",
        "    filtered_sentence = \"\"\n",
        "    for word in words:\n",
        "        word = word.translate(table)\n",
        "        if word not in stopwords:\n",
        "            filtered_sentence = filtered_sentence + word + \" \"\n",
        "    imdb_sentences.append(filtered_sentence)\n",
        "\n",
        "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=25000)\n",
        "tokenizer.fit_on_texts(imdb_sentences)\n",
        "sequences = tokenizer.texts_to_sequences(imdb_sentences)\n",
        "print(tokenizer.word_index)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "OO4yo8VPodVH",
        "colab": {}
      },
      "source": [
        "sentences = [\n",
        "    'Today is a sunny day',\n",
        "    'Today is a rainy day',\n",
        "    'Is it sunny today?'\n",
        "]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "5pNQboo9ofT_",
        "colab": {}
      },
      "source": [
        "\n",
        "sequences = tokenizer.texts_to_sequences(sentences)\n",
        "print(sequences)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5OH-8nqx4iZX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "reverse_word_index = dict([(value, key) for (key, value) in tokenizer.word_index.items()])\n",
        "\n",
        "decoded_review = ' '.join([reverse_word_index.get(i, '?') for i in sequences[0]])\n",
        "\n",
        "print(decoded_review)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "PshtmosjD21s",
        "colab_type": "text"
      },
      "source": [
        "# Trying Subwords\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1-tNv8WWD48T",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "(train_data, test_data), info = tfds.load(\n",
        "    # Use the version pre-encoded with an ~8k vocabulary.\n",
        "    'imdb_reviews/subwords8k', \n",
        "    # Return the train/test datasets as a tuple.\n",
        "    split = (tfds.Split.TRAIN, tfds.Split.TEST),\n",
        "    # Return (example, label) pairs from the dataset (instead of a dictionary).\n",
        "    as_supervised=True,\n",
        "    # Also return the `info` structure. \n",
        "    with_info=True)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "V1AsmTqaD8vL",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "encoder = info.features['text'].encoder\n",
        "print ('Vocabulary size: {}'.format(encoder.vocab_size))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "E3E3S3frI57V",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "\n",
        "print(encoder.decode([1, 2, 3]))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "61_o6cxOEBy6",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "sample_string = 'Today is a sunny day'\n",
        "\n",
        "encoded_string = encoder.encode(sample_string)\n",
        "print ('Encoded string is {}'.format(encoded_string))\n",
        "\n",
        "original_string = encoder.decode(encoded_string)\n",
        "print ('The original string: \"{}\"'.format(original_string))\n",
        "\n",
        "assert original_string == sample_string"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0Cl-BHuTKotn",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print(encoder.subwords[6426])"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}